last changed: 2023-08-28


Feature annotation of H3K27ac peaks

Data processing

Load the rlog transformed DESeq data set and construct the required data frames. The function assay returns the values and colData returns the information about the experimental design.

> load(file = "./../Clustering/Data/rlog_H3K27ac.rda")
> 
> m.H3.ac <- assay(rlog.heat)
> paged_table(m.H3.ac %>% as.data.frame())
> info.H3.ac <- data.frame(colData(rlog.heat))
> paged_table(info.H3.ac)

Next the GRanges object can be constructed using the rownames of the DESeq data set, which contain information about the location of each peak.

> chr <- str_split_i(rownames(m.H3.ac), ":", 1)
> pos <- str_split_i(rownames(m.H3.ac), ":", 2)
> start <- as.integer(str_split_i(pos, "-", 1))
> end <- as.integer(str_split_i(rownames(m.H3.ac), "-", 2))
> 
> peaks.H3ac <- GRanges(seqnames = chr, 
+                         ranges =IRanges(start = start, end = end))
> 
> saveRDS(peaks.H3ac, "./Data/peaks_H3ac.rds")
> paged_table(peaks.H3ac %>% as.data.frame())

To visualize the distance to TSS the by cluster, a list containing a entry for
each cluster with the corresponding peaks is required. Thus, the clustering results are loaded and used to filter the GRanges object.

> kmeans.k3 <- readRDS("./../Clustering/Data/kmeans.rds")
> 
> clus.kmeans <- melt(kmeans.k3$cluster)
> clus.kmeans <- cbind(clus.kmeans, 1:nrow(clus.kmeans))
> colnames(clus.kmeans) <- c("cluster", "ind")
> clus.kmeans <- clus.kmeans[order(clus.kmeans[, "cluster"]), ]
> paged_table(clus.kmeans)
> ind.cluster1 <- clus.kmeans[which(clus.kmeans[, "cluster"] == 1), "ind"]
> ind.cluster2 <- clus.kmeans[which(clus.kmeans[, "cluster"] == 2), "ind"]
> ind.cluster3 <- clus.kmeans[which(clus.kmeans[, "cluster"] == 3), "ind"]
> 
> peaks.k1 <- peaks.H3ac[ind.cluster1, ]
> peaks.k2 <- peaks.H3ac[ind.cluster2, ]
> peaks.k3 <- peaks.H3ac[ind.cluster3, ]
> 
> peaks <- list(cluster_1 = peaks.k1, cluster_2 = peaks.k2, cluster_3 = peaks.k3)
> head(peaks)
## $cluster_1
## GRanges object with 2799 ranges and 0 metadata columns:
##          seqnames              ranges strand
##             <Rle>           <IRanges>  <Rle>
##      [1]    chr14   64840548-64843743      *
##      [2]    chr14   64844047-64844598      *
##      [3]     chr1 116508159-116509891      *
##      [4]     chr2   58040368-58048461      *
##      [5]     chr5   37667044-37670097      *
##      ...      ...                 ...    ...
##   [2795]     chr1 181133649-181136568      *
##   [2796]     chr8 127734251-127741103      *
##   [2797]     chr8 127742153-127742995      *
##   [2798]    chr11     8831934-8834702      *
##   [2799]    chr20   48846149-48847290      *
##   -------
##   seqinfo: 28 sequences from an unspecified genome; no seqlengths
## 
## $cluster_2
## GRanges object with 1517 ranges and 0 metadata columns:
##          seqnames              ranges strand
##             <Rle>           <IRanges>  <Rle>
##      [1]    chr14   34718145-34719716      *
##      [2]    chr10   31099003-31099816      *
##      [3]     chr7     6445995-6449792      *
##      [4]     chr8 117520018-117521481      *
##      [5]    chr12   65946466-65950178      *
##      ...      ...                 ...    ...
##   [1513]     chr8 142232614-142237657      *
##   [1514]     chrX   72269124-72270912      *
##   [1515]     chr7   73847739-73848606      *
##   [1516]    chr15   40306527-40309992      *
##   [1517]     chr5   89929848-89931827      *
##   -------
##   seqinfo: 28 sequences from an unspecified genome; no seqlengths
## 
## $cluster_3
## GRanges object with 1984 ranges and 0 metadata columns:
##          seqnames            ranges strand
##             <Rle>         <IRanges>  <Rle>
##      [1]    chr14 64879344-64880265      *
##      [2]    chr20 43656766-43658787      *
##      [3]    chr20 43666580-43668593      *
##      [4]    chr13 51802460-51805452      *
##      [5]    chr10 31030898-31035191      *
##      ...      ...               ...    ...
##   [1980]    chr22 49825757-49828338      *
##   [1981]     chr8 56209657-56211303      *
##   [1982]    chr20 48826483-48829133      *
##   [1983]    chr14 73136349-73137361      *
##   [1984]    chr22 33919379-33921043      *
##   -------
##   seqinfo: 28 sequences from an unspecified genome; no seqlengths

Average distance to TSS

The GRanges list can then be used to plot the average distance to the TSS by cluster.

> hg38 <- TxDb.Hsapiens.UCSC.hg38.knownGene
> promoter <- getPromoters(TxDb = hg38, upstream=3000, downstream=3000)
> tagM.peaks <- lapply(peaks, getTagMatrix, windows=promoter)
## >> preparing start_site regions by gene... 2023-08-28 18:12:12
## >> preparing tag matrix...  2023-08-28 18:12:12 
## >> preparing start_site regions by gene... 2023-08-28 18:12:47
## >> preparing tag matrix...  2023-08-28 18:12:47 
## >> preparing start_site regions by gene... 2023-08-28 18:13:17
## >> preparing tag matrix...  2023-08-28 18:13:17
> plotAvgProf(tagM.peaks, TxDb = hg38, xlim=c(-3000, 3000), 
+              xlab = "Genomic Region (5'->3')", ylab = "Peak Count") +
+   scale_color_manual(values = brewer.pal(n = 3, name = "Dark2"))
## >> plotting figure...             2023-08-28 18:13:48


Feature annotation

Next, the peaks were annotated to genes using annotatePeak and all known human genes. The resulting object was then plotted with plotAnnoBar, to visualize the distribution of features among peaks.

> anno.list <- lapply(peaks, annotatePeak, TxDb = hg38, 
+                          tssRegion=c(-3000, 3000), verbose=FALSE)
> 
> plotAnnoBar(anno.list)


Enrichment Analysis

Data processing

For the enrichment analysis the peaks were annotated to only coding genes within a range of -1 to 1 kb from the transcription start site (TSS). For this the peaks are annotated to using only the protein coding genes.

> TxDb.protein.coding <- loadDb( file="./Data/TxDb.protein.coding.sqlite")
> seqlevelsStyle(TxDb.protein.coding) <- "UCSC"
> 
> peak.coding <- lapply(peaks, annotatePeak, TxDb = TxDb.protein.coding, 
+                       tssRegion = c(-3000, 3000), verbose=FALSE)

Next, the following function was used to translate the ENSEMBL identifiers to ENTREZID.

> # generate required structure:
> conv2table<-function(x){
+   eg = bitr(x@anno$geneId, fromType="ENSEMBL", toType="SYMBOL", OrgDb="org.Hs.eg.db")
+   ez = bitr(x@anno$geneId, fromType="ENSEMBL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
+   anno.table<-x@anno
+   #add gene symbols
+   mx<-match(anno.table$geneId, eg$ENSEMBL)
+   anno.table$gene_symbol<-eg[mx,"SYMBOL"]
+   #add gene entrezID
+   mx<-match(anno.table$geneId, ez$ENSEMBL)
+   anno.table$gene_entrez<-ez[mx,"ENTREZID"]
+   
+   res.table<-cbind(
+     chr=as.vector(seqnames(anno.table)),
+     start=start(anno.table),
+     end=end(anno.table),
+     name=anno.table$name,
+     peak.p_adj=anno.table$qValue,
+     strand=rep(".", length(anno.table)),
+     genome_anno=anno.table$annotation,
+     gene_symbol=anno.table$gene_symbol,
+     gene_entrez=anno.table$gene_entrez,
+     distanceToTSS=anno.table$distanceToTSS,
+     transcriptId=anno.table$transcriptId,
+     geneId_ensembl=anno.table$geneId)
+ }
> 
> peak.coding.ID <- lapply(peak.coding, function(n) conv2table(n))

The resulting list was then filtered for peaks which are within a range of -1 to 1 kb from the TSS.

> peak.coding.1kb <- lapply(peak.coding.ID, function(x) 
+   filter(as.data.frame(x), 
+          abs(as.integer(as.data.frame(x)$distanceToTSS)) <= 1000))
> 
> saveRDS(peak.coding.1kb, file = "./Data/peaks_H3ac_cod1kb.rds")
> lapply(peak.coding.1kb, head)
## $cluster_1
##     chr     start       end strand      genome_anno gene_symbol gene_entrez
## 1  chr2  58040368  58048461      . Promoter (<=1kb)        VRK2        7444
## 2  chr4 113977892 113980658      . Promoter (<=1kb)        ARSJ       79642
## 3 chr12 103929052 103930016      . Promoter (<=1kb)     HSP90B1        7184
## 4 chr12 103930337 103931619      . Promoter (<=1kb)     HSP90B1        7184
## 5  chr4 159282053 159285899      . Promoter (<=1kb)     RAPGEF2        9693
## 6  chr9  69375779  69378259      . Promoter (<=1kb)     ENTREP1        9413
##   distanceToTSS    transcriptId  geneId_ensembl
## 1             0 ENST00000412104 ENSG00000028116
## 2             0 ENST00000315366 ENSG00000180801
## 3           -91 ENST00000549334 ENSG00000166598
## 4             0 ENST00000680391 ENSG00000166598
## 5             0 ENST00000514565 ENSG00000109756
## 6             0 ENST00000645516 ENSG00000135063
## 
## $cluster_2
##     chr     start       end strand      genome_anno gene_symbol gene_entrez
## 1  chr7   6445995   6449792      . Promoter (<=1kb)       DAGLB      221955
## 2  chr8 117520018 117521481      . Promoter (<=1kb)       MED30       90390
## 3  chr1 204213358 204214078      . Promoter (<=1kb)      GOLT1A      127845
## 4  chr9 128455194 128457511      . Promoter (<=1kb)        ODF2        4957
## 5 chr17  78167924  78169666      . Promoter (<=1kb)      SYNGR2        9144
## 6 chr10  63266723  63270752      . Promoter (<=1kb)      JMJD1C      221037
##   distanceToTSS    transcriptId  geneId_ensembl
## 1             0 ENST00000297056 ENSG00000164535
## 2             0 ENST00000297347 ENSG00000164758
## 3             0 ENST00000308302 ENSG00000174567
## 4             0 ENST00000351030 ENSG00000136811
## 5             0 ENST00000225777 ENSG00000108639
## 6             0 ENST00000402544 ENSG00000171988
## 
## $cluster_3
##     chr    start      end strand      genome_anno gene_symbol gene_entrez
## 1 chr14 64879344 64880265      . Promoter (<=1kb)        SPTB        6710
## 2 chr20 43666580 43668593      . Promoter (<=1kb)       MYBL2        4605
## 3 chr13 51802460 51805452      . Promoter (<=1kb)      DHRS12       79758
## 4 chr10 31030898 31035191      . Promoter (<=1kb)      ZNF438      220929
## 5 chr19 18519575 18522156      . Promoter (<=1kb)         ELL        8178
## 6 chr17 30969849 30972583      . Promoter (<=1kb)      RNF135       84282
##   distanceToTSS    transcriptId  geneId_ensembl
## 1             0 ENST00000644917 ENSG00000070182
## 2             0 ENST00000396863 ENSG00000101057
## 3             0 ENST00000650833 ENSG00000102796
## 4             0 ENST00000436087 ENSG00000183621
## 5             0 ENST00000262809 ENSG00000105656
## 6             0 ENST00000443677 ENSG00000181481
> names(peak.coding.1kb) <- c("1", "2", "3")
> cod.genes.1kb <- lapply(peak.coding.1kb, function(i) as.data.frame(i)$gene_entrez)

Reactome pathway analysis enrichment analysis

Finally, the filtered list with the coding genes can be used for an enrichment analysis using the function enrichPathway, which can be plotted using dotplot.

> comp.Reac <- compareCluster(geneCluster = cod.genes.1kb,
+                             fun = "enrichPathway",
+                             pvalueCutoff = 0.05,
+                             pAdjustMethod = "BH")
> paged_table(comp.Reac %>% as.data.frame())
> dotplot(comp.Reac, showCategory = 10, 
+         title = "Reactome Pathway Enrichment Analysis (coding genes)")


Gene Ontology enrichment analysis

Exactly the same as above can be done with the function enrichGO.

> comp.GO <- compareCluster(geneCluster = cod.genes.1kb,
+                           fun = "enrichGO",
+                           OrgDb = org.Hs.eg.db,
+                           ont = "BP",
+                           readable = T,
+                           pvalueCutoff = 0.05,
+                           pAdjustMethod = "BH")
> paged_table(comp.GO %>% as.data.frame())
> dotplot(comp.GO, showCategory = 10, 
+         title = "GO Enrichment Analysis (coding genes)")

References - packages


Session Information

The following versions of R and R packages were used to generate the report above:

## R version 4.3.0 (2023-04-21)
## Platform: x86_64-apple-darwin20 (64-bit)
## Running under: macOS Ventura 13.4
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib;  LAPACK version 3.11.0
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Europe/Berlin
## tzcode source: internal
## 
## attached base packages:
## [1] stats4    stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
##  [1] reshape2_1.4.4                          
##  [2] ggplot2_3.4.2                           
##  [3] dplyr_1.1.2                             
##  [4] RColorBrewer_1.1-3                      
##  [5] ReactomePA_1.44.0                       
##  [6] org.Hs.eg.db_3.17.0                     
##  [7] clusterProfiler_4.8.1                   
##  [8] TxDb.Hsapiens.UCSC.hg38.knownGene_3.17.0
##  [9] GenomicFeatures_1.52.0                  
## [10] AnnotationDbi_1.62.1                    
## [11] DESeq2_1.40.1                           
## [12] SummarizedExperiment_1.30.2             
## [13] Biobase_2.60.0                          
## [14] MatrixGenerics_1.12.2                   
## [15] matrixStats_1.0.0                       
## [16] GenomicRanges_1.52.0                    
## [17] GenomeInfoDb_1.36.0                     
## [18] IRanges_2.34.0                          
## [19] S4Vectors_0.38.1                        
## [20] BiocGenerics_0.46.0                     
## [21] ChIPseeker_1.36.0                       
## [22] stringr_1.5.0                           
## [23] report_0.5.7                            
## [24] rmarkdown_2.23                          
## [25] knitr_1.43                              
## 
## loaded via a namespace (and not attached):
##   [1] splines_4.3.0                          
##   [2] BiocIO_1.10.0                          
##   [3] bitops_1.0-7                           
##   [4] ggplotify_0.1.0                        
##   [5] filelock_1.0.2                         
##   [6] tibble_3.2.1                           
##   [7] polyclip_1.10-4                        
##   [8] graph_1.78.0                           
##   [9] XML_3.99-0.14                          
##  [10] lifecycle_1.0.3                        
##  [11] lattice_0.21-8                         
##  [12] MASS_7.3-60                            
##  [13] insight_0.19.3                         
##  [14] magrittr_2.0.3                         
##  [15] sass_0.4.6                             
##  [16] jquerylib_0.1.4                        
##  [17] yaml_2.3.7                             
##  [18] plotrix_3.8-2                          
##  [19] cowplot_1.1.1                          
##  [20] DBI_1.1.3                              
##  [21] zlibbioc_1.46.0                        
##  [22] purrr_1.0.1                            
##  [23] ggraph_2.1.0                           
##  [24] RCurl_1.98-1.12                        
##  [25] yulab.utils_0.0.6                      
##  [26] tweenr_2.0.2                           
##  [27] rappdirs_0.3.3                         
##  [28] GenomeInfoDbData_1.2.10                
##  [29] enrichplot_1.20.0                      
##  [30] ggrepel_0.9.3                          
##  [31] tidytree_0.4.2                         
##  [32] reactome.db_1.84.0                     
##  [33] codetools_0.2-19                       
##  [34] DelayedArray_0.26.3                    
##  [35] DOSE_3.26.1                            
##  [36] xml2_1.3.4                             
##  [37] ggforce_0.4.1                          
##  [38] tidyselect_1.2.0                       
##  [39] aplot_0.1.10                           
##  [40] farver_2.1.1                           
##  [41] viridis_0.6.3                          
##  [42] BiocFileCache_2.8.0                    
##  [43] GenomicAlignments_1.36.0               
##  [44] jsonlite_1.8.5                         
##  [45] tidygraph_1.2.3                        
##  [46] tools_4.3.0                            
##  [47] progress_1.2.2                         
##  [48] treeio_1.24.1                          
##  [49] TxDb.Hsapiens.UCSC.hg19.knownGene_3.2.2
##  [50] Rcpp_1.0.10                            
##  [51] glue_1.6.2                             
##  [52] gridExtra_2.3                          
##  [53] xfun_0.39                              
##  [54] qvalue_2.32.0                          
##  [55] withr_2.5.0                            
##  [56] fastmap_1.1.1                          
##  [57] boot_1.3-28.1                          
##  [58] fansi_1.0.4                            
##  [59] caTools_1.18.2                         
##  [60] digest_0.6.31                          
##  [61] R6_2.5.1                               
##  [62] gridGraphics_0.5-1                     
##  [63] colorspace_2.1-0                       
##  [64] GO.db_3.17.0                           
##  [65] gtools_3.9.4                           
##  [66] biomaRt_2.56.1                         
##  [67] RSQLite_2.3.1                          
##  [68] utf8_1.2.3                             
##  [69] tidyr_1.3.0                            
##  [70] generics_0.1.3                         
##  [71] data.table_1.14.8                      
##  [72] rtracklayer_1.60.0                     
##  [73] prettyunits_1.1.1                      
##  [74] graphlayouts_1.0.0                     
##  [75] httr_1.4.6                             
##  [76] S4Arrays_1.0.4                         
##  [77] scatterpie_0.2.1                       
##  [78] graphite_1.46.0                        
##  [79] pkgconfig_2.0.3                        
##  [80] gtable_0.3.3                           
##  [81] blob_1.2.4                             
##  [82] XVector_0.40.0                         
##  [83] shadowtext_0.1.2                       
##  [84] htmltools_0.5.5                        
##  [85] fgsea_1.26.0                           
##  [86] scales_1.2.1                           
##  [87] png_0.1-8                              
##  [88] ggfun_0.1.0                            
##  [89] rstudioapi_0.14                        
##  [90] rjson_0.2.21                           
##  [91] nlme_3.1-162                           
##  [92] curl_5.0.1                             
##  [93] cachem_1.0.8                           
##  [94] KernSmooth_2.23-21                     
##  [95] parallel_4.3.0                         
##  [96] HDO.db_0.99.1                          
##  [97] restfulr_0.0.15                        
##  [98] pillar_1.9.0                           
##  [99] grid_4.3.0                             
## [100] vctrs_0.6.3                            
## [101] gplots_3.1.3                           
## [102] dbplyr_2.3.2                           
## [103] evaluate_0.21                          
## [104] cli_3.6.1                              
## [105] locfit_1.5-9.8                         
## [106] compiler_4.3.0                         
## [107] Rsamtools_2.16.0                       
## [108] rlang_1.1.1                            
## [109] crayon_1.5.2                           
## [110] labeling_0.4.2                         
## [111] plyr_1.8.8                             
## [112] stringi_1.7.12                         
## [113] viridisLite_0.4.2                      
## [114] BiocParallel_1.34.2                    
## [115] munsell_0.5.0                          
## [116] Biostrings_2.68.1                      
## [117] lazyeval_0.2.2                         
## [118] GOSemSim_2.26.0                        
## [119] Matrix_1.5-4.1                         
## [120] hms_1.1.3                              
## [121] patchwork_1.1.2                        
## [122] bit64_4.0.5                            
## [123] KEGGREST_1.40.0                        
## [124] highr_0.10                             
## [125] igraph_1.5.0                           
## [126] memoise_2.0.1                          
## [127] bslib_0.5.0                            
## [128] ggtree_3.8.0                           
## [129] fastmatch_1.1-3                        
## [130] bit_4.0.5                              
## [131] downloader_0.4                         
## [132] ape_5.7-1                              
## [133] gson_0.1.0